{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Working with categorical variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn import datasets\n",
    "import numpy as np\n",
    "iris = datasets.load_iris()\n",
    " \n",
    "X = iris.data\n",
    "y = iris.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.]])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import preprocessing\n",
    "cat_encoder = preprocessing.OneHotEncoder()\n",
    "cat_encoder.fit_transform(y.reshape(-1,1)).toarray()[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  1.,  0.],\n",
       "       [ 0.,  1.,  0.],\n",
       "       [ 0.,  1.,  0.]])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_encoder.transform(np.ones((3, 1))).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import Ridge\n",
    "ridge_inst = Ridge()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.multioutput import MultiOutputRegressor\n",
    "multi_ridge = MultiOutputRegressor(ridge_inst, n_jobs=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn import preprocessing\n",
    "cat_encoder = preprocessing.OneHotEncoder()\n",
    "y_multi = cat_encoder.fit_transform(y.reshape(-1,1)).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y_multi, stratify=y, random_state= 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultiOutputRegressor(estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n",
       "   normalize=False, random_state=None, solver='auto', tol=0.001),\n",
       "           n_jobs=-1)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "multi_ridge.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.81689644,  0.36563058, -0.18252702],\n",
       "       [ 0.95554968,  0.17211249, -0.12766217],\n",
       "       [-0.01674023,  0.36661987,  0.65012036],\n",
       "       [ 0.17872673,  0.474319  ,  0.34695427],\n",
       "       [ 0.8792691 ,  0.14446485, -0.02373395]])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_multi_pre = multi_ridge.predict(X_test)\n",
    "y_multi_pre[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.],\n",
       "       [ 0.,  0.,  1.],\n",
       "       [ 0.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.]])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import preprocessing\n",
    "y_multi_pred = preprocessing.binarize(y_multi_pre,threshold=0.5)\n",
    "y_multi_pred[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.91987179487179482"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "roc_auc_score(y_test, y_multi_pre)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Multi-Output Scores for the Iris Flowers: \n",
      "('Accuracy score of flower 0', 1.0)\n",
      "('AUC score of flower 0', 1.0)\n",
      "\n",
      "('Accuracy score of flower 1', 0.73684210526315785)\n",
      "('AUC score of flower 1', 0.76923076923076927)\n",
      "\n",
      "('Accuracy score of flower 2', 0.97368421052631582)\n",
      "('AUC score of flower 2', 0.99038461538461542)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    " \n",
    "print (\"Multi-Output Scores for the Iris Flowers: \")\n",
    "for column_number in range(0,3):\n",
    "     print (\"Accuracy score of flower \" + str(column_number),accuracy_score(y_test[:,column_number], y_multi_pred[:,column_number]))\n",
    "     print (\"AUC score of flower \" + str(column_number),roc_auc_score(y_test[:,column_number], y_multi_pre[:,column_number]))\n",
    "     print (\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.],\n",
       "       [ 1.,  0.,  0.]])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction import DictVectorizer\n",
    "dv = DictVectorizer()\n",
    "my_dict = [{'species': iris.target_names[i]} for i in y]\n",
    "dv.fit_transform(my_dict).toarray()[:5]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
